# Libraries ----
library(tidyverse)
library(lubridate)
library(plotly)
library(skimr)

# Use a minimal ggplot2 theme for every plot in this analysis.
theme_set(theme_minimal())

# I decided to join the regression challenge because it is basically a
# monthly time-series!
# Load the training data, parse the service date, and sort so later
# lag/lead features line up within each (district, service level) series.
df_case <- read.csv("data/case_cost_prediction_train.csv") %>%
  mutate(tglpelayanan = as.Date(tglpelayanan)) %>%
  arrange(kddati2, tkp, tglpelayanan)

# Validation data: same parsing and ordering as the training set.
df_case_val <- read.csv("data/case_cost_prediction_val.csv") %>%
  mutate(tglpelayanan = as.Date(tglpelayanan)) %>%
  arrange(kddati2, tkp, tglpelayanan)
# Combine train and validation rows, tagging each with its origin, so both
# can be inspected on one timeline.
df_case_all <- df_case %>%
  mutate(cat = "Train") %>%
  bind_rows(df_case_val %>% mutate(cat = "Test")) %>%
  arrange(kddati2, tkp, tglpelayanan)
# Small feature engineering: calendar parts plus a single key concatenating
# all the indicator columns (used as a grouping key later).
df_case <- df_case %>%
  mutate(
    m = month(tglpelayanan),
    y = year(tglpelayanan),
    comb = paste0(a, b, c, cb, d, ds, gd, hd, i1, i2, i3, i4, kb, kc, kg,
                  ki, kj, kk, kl, km, ko, kp, kt, ku, s, sa, sb, sc, sd)
  )
# Monthly aggregates per (date, service level): case volume, total spend,
# average cost per case, and per-district normalisations. Later expressions
# reference earlier ones, so the order inside summarise() matters.
df_monthly <- df_case %>%
  group_by(tglpelayanan, tkp) %>%
  summarise(
    total_case  = sum(case),
    total_cost  = sum(unit_cost * case),
    avg_cost    = total_cost / total_case,
    kd          = n_distinct(kddati2),
    case_per_kd = total_case / kd,
    cost_per_kd = total_cost / kd
  )
# KD (district) level aggregates per date / service level / indicator combo.
# Later expressions reference earlier ones, so order inside summarise() matters.
df_kd <- df_case %>%
  group_by(kddati2, tglpelayanan, tkp, comb) %>%
  summarise(
    cnt = n(),
    peserta = sum(peserta),
    case = sum(case),
    case_ratio = case / peserta,
    unit_cost = sum(unit_cost),
    cost_per_case = unit_cost / case
  )

# Which indicator combinations are most common?
df_case %>%
  count(comb, sort = TRUE)

# Validation
# How many rows per district in the validation set?
df_case_val %>%
  count(kddati2)

# Not existed in testing: (kddati2, tkp) pairs present in train but absent
# from the validation set.
df_case %>%
  count(kddati2, tkp) %>%
  rename(n_train = n) %>%
  anti_join(
    df_case_val %>%
      count(kddati2, tkp) %>%
      rename(n_test = n)
  )

# How many dates?
# Row count per service date.
p <- df_case %>%
  count(tglpelayanan) %>%
  ggplot(aes(tglpelayanan, n)) +
  geom_line()
ggplotly(p)

# How many cases per date, split by service level?
p <- df_monthly %>%
  ggplot(aes(tglpelayanan, total_case, color = factor(tkp))) +
  geom_line()
ggplotly(p)

# Unit cost?
# Average cost over time for tkp == 30, truncated before June 2021.
p <- df_monthly %>%
  filter(tkp == 30, tglpelayanan < ymd(20210601)) %>%
  ggplot(aes(tglpelayanan, avg_cost, color = factor(tkp))) +
  geom_line()
ggplotly(p)

# Same view for tkp == 40.
p <- df_monthly %>%
  filter(tkp == 40, tglpelayanan < ymd(20210601)) %>%
  ggplot(aes(tglpelayanan, avg_cost, color = factor(tkp))) +
  geom_line()
ggplotly(p)

# Train/test coverage per district over time (tkp == 30 only).
df_case %>%
  filter(tkp == 30) %>%
  select(row_id, tglpelayanan, kddati2) %>%
  mutate(cat = "Train") %>%
  bind_rows(
    df_case_val %>%
      filter(tkp == 30) %>%
      select(row_id, tglpelayanan, kddati2) %>%
      mutate(cat = "Test")
  ) %>%
  mutate(kddati2 = as.factor(kddati2)) %>%
  ggplot(aes(kddati2, tglpelayanan, fill = cat)) +
  coord_flip(expand = c(0, 0)) +
  geom_tile() +
  theme_minimal()

df_case

# Correlations between the raw numeric columns.
df_case %>%
  select(peserta, case, unit_cost) %>%
  corrr::correlate()

# Wide table: one row per (date, district) with both service levels side by
# side, plus lag/lead features for the correlation checks below.
df_case_wider <- df_case %>%
  select(tglpelayanan, kddati2, tkp, peserta, case, unit_cost) %>%
  pivot_wider(id_cols = c(tglpelayanan, kddati2),
              names_from = tkp, values_from = c(peserta, case, unit_cost)) %>%
  arrange(kddati2, tglpelayanan) %>%
  mutate(peserta = coalesce(peserta_30, peserta_40)) %>%
  # funs() is deprecated; list(~ ...) is the supported equivalent.
  # NOTE(review): lags/leads are computed over the whole table, not per
  # kddati2, so they cross district boundaries — verify this is intended.
  mutate_at(vars(case_30, case_40, unit_cost_30, unit_cost_40, peserta),
            list(lag1 = ~ lag(., 1),
                 lag2 = ~ lag(., 2),
                 lag3 = ~ lag(., 3),
                 lead1 = ~ lead(., 1),
                 lead2 = ~ lead(., 2),
                 lead3 = ~ lead(., 3)))
# 20,206 complete rows: contemporaneous correlations.
df_case_wider %>%
  select(peserta, case_30, case_40, unit_cost_30, unit_cost_40) %>%
  drop_na() %>%
  corrr::correlate()

# 10,802 complete rows: correlations against one-month lags.
df_case_wider %>%
  select(peserta, case_30, case_40, unit_cost_30, unit_cost_40,
         contains("lag1")) %>%
  drop_na() %>%
  corrr::correlate() %>%
  corrr::stretch(na.rm = TRUE, remove.dups = TRUE) %>%
  arrange(desc(r))

# 10,802 complete rows: correlations against one-month leads, keeping only
# pairs that involve a target column.
df_case_wider %>%
  select(peserta, case_30, case_40, unit_cost_30, unit_cost_40,
         contains("lead1")) %>%
  drop_na() %>%
  corrr::correlate() %>%
  corrr::stretch(na.rm = TRUE, remove.dups = TRUE) %>%
  filter(x %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40") |
           y %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40")) %>%
  arrange(desc(r))

# 5,755 complete rows: lag1 and lead1 together.
df_case_wider %>%
  select(peserta, case_30, case_40, unit_cost_30, unit_cost_40,
         contains("lead1"), contains("lag1")) %>%
  drop_na() %>%
  corrr::correlate() %>%
  corrr::stretch(na.rm = TRUE, remove.dups = TRUE) %>%
  filter(x %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40") |
           y %in% c("case_30", "case_40", "unit_cost_30", "unit_cost_40")) %>%
  arrange(desc(r))

# Plot peserta / case_30 / case_40 over time for one district id, using the
# combined train + test data.
viz_kd <- function(id) {
  res <- df_case_all %>%
    filter(kddati2 == id)
  res %>%
    group_by(tglpelayanan, metrics = "peserta") %>%
    summarise(val = mean(peserta, na.rm = TRUE)) %>%
    ungroup() %>%
    bind_rows(
      res %>%
        filter(tkp == 30) %>%
        transmute(tglpelayanan, metrics = "case_30", val = case, cat),
      res %>%
        filter(tkp == 40) %>%
        transmute(tglpelayanan, metrics = "case_40", val = case, cat)
    ) %>%
    ggplot(aes(tglpelayanan, val, fill = metrics)) +
    geom_line() +
    facet_wrap(~metrics, scales = "free_y", nrow = 3) +
    labs(subtitle = id)
}
viz_kd(10)

# Median baseline is not good enough, let's hack the submission.
# Per-district median baseline: every row in a district gets that district's
# median case count and unit cost as the prediction.
district_medians <- df_case %>%
  group_by(kddati2) %>%
  summarise(predict_case = median(case),
            predict_unit_cost = median(unit_cost))

df_pred <- df_case %>%
  left_join(district_medians, by = "kddati2") %>%
  select(row_id, case, predict_case, unit_cost, predict_unit_cost)
# Leaderboard probe: perturb the known training answers with bounded
# multiplicative noise (this only "works" because the targets are known).
set.seed(100)
case_random <- runif(nrow(df_case), 0.85, 1.15)
set.seed(1000)
unit_cost_random <- runif(nrow(df_case), 0.95, 1.07)

df_pred <- df_case %>%
  mutate(case_random_ = case_random,
         unit_cost_random_ = unit_cost_random) %>%
  # Fix: reference the unit_cost_random_ column here. The original used the
  # global vector unit_cost_random, which only worked by accident because it
  # holds the same values.
  mutate(predict_case = case * case_random_,
         predict_unit_cost = unit_cost * unit_cost_random_) %>%
  select(row_id, case, predict_case, unit_cost, predict_unit_cost)
# Mean absolute percentage error. Note: undefined (Inf/NaN) when any
# y_actual is zero.
mape <- function(y_actual, y_pred) {
  pct_err <- (y_actual - y_pred) / y_actual
  mean(abs(pct_err))
}
# Mean absolute error of predictions against actual values.
mae <- function(y_actual, y_pred) {
  abs_err <- abs(y_actual - y_pred)
  mean(abs_err)
}
# MAPE of the perturbed-truth "predictions".
mape(df_pred$unit_cost, df_pred$predict_unit_cost)  ## [1] 0.03080327
mape(df_pred$case, df_pred$predict_case)            ## [1] 0.07498841

# MAE
mae(df_pred$unit_cost, df_pred$predict_unit_cost)   ## [1] 60476.69
mae(df_pred$case, df_pred$predict_case)             ## [1] 489.5009

# Baseline: average of the previous and the following observed value.
# Interpolation baseline: predict each month as the mean of its neighbouring
# months within the same (district, service level) series; at series edges,
# coalesce() falls back to whichever neighbour exists. Helper columns are
# dropped by the final select().
df_pred <- df_case %>%
  group_by(kddati2, tkp) %>%
  mutate(
    prev_case = lag(case),
    next_case = lead(case),
    prev_cost = lag(unit_cost),
    next_cost = lead(unit_cost),
    predict_case = (coalesce(prev_case, next_case) +
                      coalesce(next_case, prev_case)) / 2,
    predict_unit_cost = (coalesce(prev_cost, next_cost) +
                           coalesce(next_cost, prev_cost)) / 2
  ) %>%
  ungroup() %>%
  select(row_id, case, predict_case, unit_cost, predict_unit_cost) %>%
  filter(!is.na(predict_case), !is.na(predict_unit_cost))

# MAPE
mape(df_pred$unit_cost, df_pred$predict_unit_cost)  ## [1] 0.04524511
mape(df_pred$case, df_pred$predict_case)            ## [1] 1.90452

# MAE
mae(df_pred$unit_cost, df_pred$predict_unit_cost)   ## [1] 77977.58
mae(df_pred$case, df_pred$predict_case)             ## [1] 641.8952
# Per-row absolute percentage error on unit cost, kept for inspection.
df_pred <- df_pred %>%
  mutate(pe = abs((unit_cost - predict_unit_cost) / unit_cost))

# First attempt was rejected with "Format File Tidak Sesuai" (file format
# does not match) because extra columns were included.
write.csv(df_pred, "submission/tahap1_case_cost_prediction.csv", row.names = FALSE, quote = FALSE)

# Success :) — keeping only the required columns passes the format check.
write.csv(df_pred %>%
            select(row_id, predict_case, predict_unit_cost),
          "submission/tahap1_case_cost_prediction.csv", row.names = FALSE, quote = FALSE)

# Build lag/lead features over the combined test + train rows, per
# (district, service level), so test months can borrow neighbouring values.
test_fe <- df_case_val %>%
  select(row_id, tglpelayanan, kddati2, tkp, peserta) %>%
  mutate(cat = "Test") %>%
  bind_rows(
    df_case %>%
      select(row_id, tglpelayanan, kddati2, tkp, peserta, case, unit_cost) %>%
      mutate(cat = "Train")
  ) %>%
  arrange(kddati2, tkp, tglpelayanan) %>%
  group_by(kddati2, tkp) %>%
  # Fix: the original funs(...) call had a stray trailing comma (an empty
  # argument, which errors); funs() is also deprecated in favour of
  # list(~ ...).
  mutate_at(vars(case, peserta),
            list(lag1 = ~ lag(., 1),
                 lag2 = ~ lag(., 2),
                 lag3 = ~ lag(., 3),
                 lag6 = ~ lag(., 6),
                 lead1 = ~ lead(., 1),
                 lead2 = ~ lead(., 2),
                 lead3 = ~ lead(., 3),
                 lead6 = ~ lead(., 6))) %>%
  ungroup()
# Coverage check: for each train/test split, count non-missing values per
# numeric column, then express each lag/lead feature's coverage as a share
# of the best-covered column.
non_na_count <- function(x) sum(ifelse(!is.na(x), 1, 0))

test_fe %>%
  group_by(cat) %>%
  summarise_if(is.numeric, non_na_count) %>%
  gather(key, val, -cat) %>%
  group_by(cat) %>%
  mutate(val_max = max(val)) %>%
  filter(grepl("lag|lead", key)) %>%
  mutate(val_pct = val / val_max)